*******************************************************************************
**
**
** This script uses the HIMSS data to create two files: 
**	(a) Hospital-year panel w/ indicators for each EMR vendor +  application
** 	(b) Hospital-year panel w/ counts of each EMR vendor + application
**
**
*******************************************************************************



set more off
clear
capture log close

**
** Settings and directories
** 
local fpath_himss_main = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/replication_files/himss/"
local fpath_input = "`fpath_himss_main'input"
local fpath_output = "`fpath_himss_main'output"

local fpath_main = "/disk/agedisk4/medicare.work/sacarny-DUA51934/shruthi-dua51934/replication_files/build/output"

log using "`fpath_himss_main'/prepare.log", replace

global STARTYEAR = 2003
global ENDYEAR = 2014

* convert data from csv to stata format?
global CONVERTDATA = 0

* make the application files? 
global MAKE_APP = 0

* make the indicator files? 
global MAKE_INDIC = 0

* roll up the converted indicator files and clean up? 
global ROLLUP_INDIC = 0

* roll up the converted application files and clean up?
global ROLLUP_APP = 1

* save a vendor indicator file at the hospital-year-vendor level?
* this is in effect only when ROLLUP_APP == 1
* note: this output is required for vendor_distance.R!
global SAVE_VENDOR_DIST = 1


* convert from csv to stata format 
if ($CONVERTDATA) {
	* make sure to import medicare pn as string
	* make sure the himss entity id is numeric 
	forvalues year=$STARTYEAR/$ENDYEAR {
		di "the year is `year'"
		
		if (`year' != 2004) & (`year' != 2003) & (`year' != 2011) & (`year' != 2006) {
			import delimited using "`fpath_input'/src/`year'/HAEntity.csv", clear stringcols(10)
			
			destring haentityid, replace
		  
		} 
		
		if `year' == 2004 | `year' == 2003 {
			import delimited using "`fpath_input'/src/`year'/HAEntity.csv", clear stringcols(3)
			
			* destring facility id
			rename facilityid haentityid
			capture confirm numeric variable haentityid 
			if _rc {	
				gen byte notnumeric = real(haentityid) == .		
				drop if notnumeric 		
				drop notnumeric 
			} 
			destring haentityid , replace			

		}		
		
		* 2011 has one observation that has string haentityid 
		if `year' == 2011 | `year' == 2006 {
			import delimited using "`fpath_input'/src/`year'/HAEntity.csv", clear stringcols(10)

			gen byte notnumeric = real(haentityid) == .		
			drop if notnumeric 		
			drop notnumeric 
			
			destring haentityid, replace
		}

		* save the cleaned entity file 
		save "`fpath_input'/src/`year'/HAEntity.dta", replace
		
		* now clean up the application files 
		import delimited using "`fpath_input'/src/`year'/HAEntityApplication.csv", clear
	
		* make sure the id variable is numeric in 2003-2004
		if `year' == 2004 | `year' == 2003 {
			rename facilityid haentityid
			
			*clean up application names 
			replace application = proper(application)
			
			* standardize the EMR / HIM categories 
			gen category = "Electronic Medical Record" if strpos(application, "Clinical Data Repository") | strpos(application, "Computerized Practitioner Order Entry") | strpos(application, "Cpoe")  | strpos(application, "Documentation") | strpos(application, "Clinical Decision Support") | strpos(application, "Order Entry") | strpos(application, "Electronic Med") 
			
			replace category = "Health Information Management (HIM)" if strpos(application, "Encoder") | strpos(application, "Abstracting") | strpos(application, "Chart Deficiency") | strpos(application, "Dictation") | strpos(application, "Chart Tracking") | strpos(application, "Transcription")			
		}
	
		* make sure the id variable is numeric here too 
		if `year' == 2012 { 
			gen byte notnumeric = real(haentityid) == .		
			drop if notnumeric 		
			drop notnumeric 
			destring haentityid, replace 
		} 
		
		* save the application file 
		save "`fpath_input'/src/`year'/HAEntityApplication.dta", replace
	}
}


** make the indicator and application files 
if $MAKE_INDIC { 

	forvalues year=$STARTYEAR/$ENDYEAR {

		di "THE YEAR IS `year'"
	
		* bring in hospital list
		use "`fpath_input'/src/`year'/HAEntity.dta", clear

		* reduce to hospital 
		if `year' == 2003 | `year' == 2004 {
			rename typefacility haentitytype 
			tostring haentitytype , replace 
			replace haentitytype = "Hospital" if haentitytype == "1" 
		}
		keep if haentitytype=="Hospital"
	
	
		* keep the HIMSS entity id, medicare, and full time employees in IS 
		keep haentityid medicarenumber
		rename medicarenumber hospid
	
		* save list of hospitals w/ pn 
		tempfile hospitallist
		isid haentityid
		sort haentityid
		save `hospitallist'

		* bring in EMR applications only (from the application files)
		use "`fpath_input'/src/`year'/HAEntityApplication.dta" if category=="Electronic Medical Record" | category == "Health Information Management (HIM)" | category == "Transcription" | category == "Document/Forms Management" 

	* make the application files 
	if $MAKE_APP {
		preserve 
		* bring in EMR and HIM applications only from the cleaned application files
		use haentityid category status application software* hardware* using ///
		"`fpath_input'/src/`year'/HAEntityApplication.dta" if ///
		category=="Electronic Medical Record" | category == "Health Information Management (HIM)" | category == "Transcription" | category == "Document/Forms Management" 
		
		if `year' == 2003 | `year' == 2004 {
		rename softwarevendor softwarevendorname
		}
		merge m:1 haentityid using `hospitallist', keep(match)
		rename hospid pn  
		
		drop haentityid 
		gen year = `year' 
	
		tostring pn, replace
		cap drop _merge 
		save "`fpath_input'/src/himss-emr-application`year'.dta", replace 
		clear 
		restore
		} 
	
	* make the indicator files 
	if $MAKE_INDIC {		
		* only want live/operational, to be replaced, replaced, and automated (see 2006 documentation for descriptions)
		tab status
		replace status = proper(status)  
		keep if status=="Live And Operational" | (status == "To Be Replaced") | (status == "Replaced") | (status=="Automated") 
	* b/c "We did not code “To be Replaced” as having an EMR because there is no indication of
	* whether the system to be replaced is a live and operational enterprise EMR. We created
	* the CPOE variable in the same way."
	* doi: 10.1007/s10729-011-9179-2 (Zhivan and Diana 2012)
	* see also doi: 10.1007/s10729-011-9165-8 (Kazley et al 2011)
	   
	tab application
	replace application = proper(application)
	
	* EMR
	* software application name -- should be exhaustive categories 
	* cdr or emr: Clinical data repository, EMR, Enterprise EMR, Electronic Med Admin Record 
	gen byte cdr_emr = (strpos(application, "Clinical Data Repository")  > 0) | (strpos(application, "Emr") > 0) | (application == "Electronic Med Admin Record (Emar)")
	gen byte cds = (strpos(application, "Clinical Decision Support") > 0) 
	gen byte pd = (application == "Clinical Documentation") | (application == "Physician Documentation")
	gen byte cpoe = (strpos(application, "Computerized Practitioner Order Entry") > 0) | (strpos(application, "Cpoe") > 0)
	gen byte oe = (cpoe == 0) & (strpos(application, "Order Entry") > 0)
		
	* 2009 adds physician portal
	gen byte pp = application=="Physician Portal"

	* 2012 adds business intelligence - clinical and patient portal
	gen byte busi = (strpos(application, "Business Intelligence") > 0)
	gen byte patp = (strpos(application, "Patient Portal") > 0)
	
	* 2005 adds medical terminology, removed in 2007 
	gen byte mt = (strpos(application, "Medical Terminology")>0) 
	
	* 2014 adds clinical data repository/ EMR 
	*gen byte cemr = application=="Clinical Data Repository/EMR" 

	* HIM
	* software application names: 
	gen byte ab = application=="Abstracting" 
	gen byte cd = application=="Chart Deficiency" 
	gen byte ctl = application=="Chart Tracking/Locator"
	gen byte di = strpos(application, "Dictation") > 0 
	gen byte enc = application=="Encoder" | application=="Computer Assisted Coding" 
	gen byte tr = strpos(application, "Transcription") > 0 
	gen byte docm = strpos(application, "Document Management") > 0 
	gen byte ef = strpos(application, "Electronic Forms") > 0 
	
	
	* data check
	egen test = rowtotal(cdr_emr cds cpoe oe pd pp busi patp mt ab cd ctl di enc tr docm ef)
	assert test == 1
	drop test 
	
	* down to the facility level
	collapse (max) cdr_emr cds cpoe oe pd pp busi patp mt ab cd ctl di enc tr docm ef, ///
			 by(haentityid)

	compress

	* merge to the hospital list to reduce to only hospitals and bring in medicare pn
	merge 1:1 haentityid using `hospitallist', keep(match using)
	tab _merge
	
	* reformat the id variables 
	tostring haentityid, replace 
	tostring hospid, replace		

	* hospitals that lacked any EMR application entries will get missing values
	* eg hospital doesn't show up in the HAapplication list 
	foreach var of varlist cdr_emr cds cpoe oe pd pp busi patp mt ab cd ctl di enc tr docm ef {

		replace `var' = 0 if missing(`var') 
		assert !missing(`var')
		
	}
	
	drop _merge 
	gen fyear = `year'
	
	
	* replace the label 
	label variable haentityid "HIMSS id - unique to year"
	label variable fyear "Year"
	label variable hospid "Medicare provider number"
	label variable cdr_emr "CDR or EMR"
	label variable cds "Clinical Decision Support"
	label variable cpoe "Computerized Practitioner Order Entry (CPOE)"
	label variable oe "Order Entry (Includes Order Communications)"
	label variable pd "Physician Documentation"
	label variable pp "Physician Portal"
	label variable busi "Business Intelligence - Clinical" 
	label variable patp "Patient Portal"
	label variable cd "Chart Deficiency"
	label variable ab "Abstracting"
	label variable ctl "Chart Tracking/Locator"
	label variable di "Dictation"
	label variable enc "Encoder or Computer Assisted Encoding" 
	label variable tr "Transcription"
	
	save "`fpath_input'/src/himss-emr-indic`year'.dta", replace
	
	* remove any row with blank pn
	drop if missing(hospid)

	* down to medicare pn - year level
	collapse ///
		(max) cdr_emr cds cpoe oe pd pp busi patp mt ab cd ctl di enc tr docm ef, /// 
		by(hospid fyear)
		
	* EMR : basic/ advanced 
	* indicator for basic emr ONLY: cdr/cds/oe and NOTHING else
	gen byte emr_basiconly = (cdr_emr | cds | oe) & !(cpoe | pd | patp | busi | pp)
	
	* indicator for at least basic: 
	gen byte emr_atlbasic = (cdr_emr | cds | oe)
	
	* indicator for at least advanced emr: 
	gen byte emr_atladv = (cpoe | pd) 
	
	* indicator for ANY emr: 
	gen byte emr_anyv1 = (cdr_emr | cds | cpoe | oe | pd | pp | busi | patp)
	
	label variable hospid "Medicare provider number"
	label variable emr_atlbasic "At least basic: cdr/ cds/ oe/ cpoe"
	label variable emr_atladv "At least advanced: cpoe/ pd" 
	label variable emr_anyv1 "Any type of EMR"

	sort hospid

	save "`fpath_input'/src/himss-emr-pn`year'.dta", replace
	}
}
}


* roll up the files across all years and save 
if $ROLLUP_INDIC { 
foreach base in himss-emr-indic himss-emr-pn {

	if ("`base'"=="himss-emr-indic") {
		local id "haentityid"
		
	}
	else {
		local id "hospid"
	}

	clear
	forvalues year=$STARTYEAR/$ENDYEAR {
		di "The year is `year' and the base is `base'"
	
		append using "`fpath_input'/src/`base'`year'.dta"
		*rm `base'`year'.dta
	}
	
	sort `id' fyear
	
	rename `id' pn
	label var pn "PN"
	
	rename fyear year 
	
	save "`fpath_input'/`base'.dta", replace
	}
}

if $ROLLUP_APP { 
	clear 
	
	* append together the application files from each year 
	forvalues year=$STARTYEAR/$ENDYEAR {
		di "The year is `year'"
		append using "`fpath_input'/src/himss-emr-application`year'.dta"
		sort pn year
		label var pn "PN"
		* rm himss-emr-application`year'.dta 		
		save "`fpath_input'/himss-emr-application.dta", replace
	}

	* bring in the main analytic file 
	use id pn year target target2 acq_legacy acq_other forprofit acqhosp using "`fpath_main'/acq_cleaned_complete_20230606.dta" , clear 	
	
	
	* merge with the application file 
	merge 1:m pn year using "`fpath_input'/himss-emr-application"
	
	* keep only if (matched) OR (unmatched and found in the main analytic file)
	keep if _merge == 1 | _merge == 3 
	rename _merge himss_merge_flag	

	* drop if missing category and clean up 
	drop if missing(category)
	replace category = "Health Information Management (HIM)" if (category == "Transcription")
	replace category = "Health Information Management (HIM)" if (category == "Document/Forms Management")
	gen byte emr = (category == "Electronic Medical Record")
	gen byte him = (category == "Health Information Management (HIM)")  
	replace category = "emr" if emr == 1
	replace category = "him" if him == 1

	* data check
	assert emr + him == 1 

	* clean up application type names 
	replace application = "Transcription" if strpos(application, "Transcription")

	* clean up status names 
	replace status = proper(status)
	gen byte tbr = (status == "To Be Replaced") | (status == "Replaced") 
	gen byte lo = (status == "Live And Operational") | (status == "Automated")

	* keep ONLY the live and operational + to be replaced apps
	keep if tbr == 1 | lo == 1 

	* data check
	assert tbr + lo == 1 


	* clean up vendor names using vendor_concordance.xlsx
	rename softwarevendorname vendor 
	drop if missing(category)
	replace vendor = proper(vendor)
	replace vendor = "Medhost" if strpos(vendor, "Healthcare Management Systems")
	replace vendor = "Mckesson" if strpos(vendor, "Mckesson")
	replace vendor = "NTT Data Corp" if strpos(vendor, "Ntt") | strpos(vendor, "Keane")
	replace vendor = "Siemens Corp" if strpos(vendor, "Siemens")
	replace vendor = "Allscripts" if strpos(vendor, "Misys") |strpos(vendor, "Eclipsys")
	replace vendor = "Thomson Corp" if strpos(vendor, "Thomson")| strpos(vendor, "Solucient")
	replace vendor = "M*Modal" if strpos(vendor, "Medquist")
	replace vendor = "3m Health Information Systems" if strpos(vendor, "Softmed")| strpos(vendor, "3m") | strpos(vendor, "3M") 
	replace vendor = "Nuance Communications" if strpos(vendor, "Dictaphone")
	replace vendor = "Blueware" if strpos(vendor, "Blueware")
	replace vendor = "Agfa Healthcare" if strpos(vendor, "Agfa") 
	replace vendor = "Nuance" if strpos(vendor, "Nuance") 
	replace vendor = "Quadramed" if strpos(vendor, "Quadramed") 
	replace vendor = "Mdi" if strpos(vendor, "Mdi ")
	replace vendor = "Dolbey" if strpos(vendor, "Dolbey")
	replace vendor = "Cerner" if strpos(vendor, "Cerner") 

	gen commaname = strpos(vendor, ",")-1
	replace vendor = substr(vendor, 1, commaname) if commaname > 0
	drop commaname 
	duplicates drop 
	
	* save the vendor observations w/ acquirer classifications if needed for vendor distance calculation 
	if $SAVE_VENDOR_DIST { 
		preserve 
		* make sure that the non-acquirer hospitals are for-profit 
		keep if forprofit == 1 

		keep id year acqhosp target target2 acq_legacy acq_other vendor 
		duplicates drop 
		
		* collapse and store the variable labels 
		tab vendor, gen(v_)
		
		foreach v of varlist v_* {
			local l`v': variable label `v'		
		}
		
		collapse (sum) v_*  (max) acqhosp target target2 acq_legacy acq_other , by(id year)
		foreach v of varlist v_* {
			local tmp = subinstr("`l`v''", "vendor==", "", 1)
			local tmp = subinstr("`tmp'", " ", "", 10)
			label variable `v' "`tmp'" 	
			local tmp = subinstr("`tmp'", "-", "", 10)
			local tmp = subinstr("`tmp'", "&", "", 10)
			local tmp = subinstr("`tmp'", ".", "", 10)
			local tmp = subinstr("`tmp'", "*", "", 10)
			local tmp = subinstr("`tmp'", "Corporation", "Corp", 10)
			local tmp = subinstr("`tmp'", "Technologies", "Tech", 10)
			local tmp = substr("`tmp'", 1, 20)
			*di "`tmp'"
			rename `v' v_`tmp'
		}
		

		* fill in "missing adoption" years
		egen id2 = group(id)
		tsset id2 year 		
		foreach v of varlist v_* {
			replace `v' = 1 if `v' == 0 & (L.`v' == 1 & F.`v' == 1) 
		} 		
		drop id2 		
		saveold "`fpath_output'/vendor_distance_v3_20230606.dta", v(12) replace 
		restore 
	} 
	* count vendors by hospital and save to csv for vendor share plots 
	preserve 
	* count the number of applications for each vendor in each category 
	bys year category vendor: gen napp_vendor_cat = _N 

	* count the number of applications from each vendor in each hospital in each year: 
	bys id year vendor: gen napp_vendor = _N 

	* collapse 
	duplicates drop id year category vendor napp_vendor_cat napp_vendor, force

	* save
	keep if ((year == 2005) | (year == 2006) | (year == 2014)) & (forprofit == 1)
	keep id year vendor category app napp* acqhosp target target2 acq_legacy acq_other  forprofit
	outsheet using "`fpath_output'/count_vendors_20230306.csv", comma names replace 
	restore
	
	save "`fpath_output'/acq_applications_status_20230306.dta", replace 	
	
}




log close
